##############################################################################################
# Author: Subhendu Malakar
# Mentor: Mridul Sankar Barik
# About: This code inserts/updates the database with new information for osvdb.org website.
# Working: This code crawls the website and extracts information from
#		   the html page to feed into the database. This information is
#		   stored in the database in a proper format. There are 3 options:
#			1. Full Crawl : This crawls all the web pages.
#			2. Range Crawl : This crawls only the range of pages specified by the user.
#			3. Sync Crawl : This crawls new pages which are not present in the database.
##############################################################################################

# Importing HTMLParser module for parsing the html page downloaded.
from html.parser import HTMLParser

# urllib.request module contains functions for downloading web pages.
import urllib.request

# urllib.error module contains functions for exception handling.
import urllib.error

# Importing psycopg2 module to connect with the PostgreSQL database.
import psycopg2

# For exit()
import sys

# For string manipulation functions.
import string

# For multithreading.
import threading

# For counting the number of cpus in the machine.
import multiprocessing

# For timeout.
import time

# Exclusive upper bound on the OSVDB page ids handled by the full and sync
# crawls: full_crawl() and sync_crawl() build a bitmap of this length and the
# crawl loops run while the id is < end, so ids 1 .. end-1 are considered.
end=16

# Class name: MyHTMLParser
# Inherits: HTMLParser
# Functions: __init__(self,strict=False)
#			handle_starttag(self,tag,attrs)
#			handle_endtag(self,tag)
#			handle_data(self,data)
#			get_description(self)
#			get_solutions(self)
#			get_cve_id(self)
#			get_cvss_score(self)
#			get_references(self)
#			get_products(self)
#			get_impact(self)
#			get_attack_from(self)
# Description: This class is responsible to extract data from a html page.
#			handle_starttag() is invoked automatically when a starting tag is encountered.
#			handle_endtag() is automatically invoked when an ending tag is encountered.
#			handle_data() is automatically invoked for the text found between tags.
class MyHTMLParser(HTMLParser):
	"""Extract vulnerability details from one OSVDB entry page.

	The parser is a small state machine driven by the HTMLParser callbacks.
	An <h1> heading such as 'Description' arms a flag, and the next piece of
	text seen inside the expected tag (<p>, <li>, <h7>, ...) is stored as the
	value of that section. A closing </table> disarms the section flags.

	Values that are never found keep their initial defaults: ' ' for the
	text fields, 0.0 for the CVSS score and empty lists for links/products.
	"""

	# Tags whose open/closed state is mirrored in an attribute of the same name.
	_TRACKED_TAGS=('h1','p','li','h6','h7')

	# Text of an <h1> heading -> name of the flag armed when that heading is seen.
	_SECTION_FLAGS={
		'Description':'desc',
		'Solution':'sol',
		'References':'ref',
		'CVSSv2 Score':'cvss',
		'Products':'prod',
		'Classification':'clas',
	}

	# Section flags that a closing </table> switches off again.
	_TABLE_SCOPED_FLAGS=('desc','sol','cvss','cve','ref','prod')

	def __init__(self,strict=False):
		"""Create a parser with every field at its default value.

		strict is accepted only so that existing callers keep working; it is
		ignored.
		"""
		# HTMLParser lost its 'strict' argument in Python 3.5 and now only takes
		# keyword arguments, so forwarding it positionally raises TypeError.
		super().__init__()

		# Tag state: True while the parser is inside the corresponding tag.
		self.h1=False
		self.p=False
		self.li=False
		self.h6=False
		self.h7=False

		# Section flags: armed by a heading/label, consumed by handle_data().
		self.desc=False
		self.sol=False
		self.ref=False
		self.cvss=False
		self.cve=False
		self.prod=False
		self.clas=False
		self.attk=False
		self.impt=False

		# Extracted values.
		self.description=' '
		self.solution=' '
		self.references=' '
		self.cvss_score=0.0
		self.cve_id=' '
		self.links=[]
		self.products=[]
		self.last_h6=' '
		self.attack_from=' '
		self.impact=' '

	def handle_starttag(self, tag, attrs):
		"""Record that a tracked tag was opened; collect hrefs of reference links."""
		if tag in self._TRACKED_TAGS:
			setattr(self,tag,True)
		elif tag == 'a' and self.ref:
			# Only anchors inside the 'References' section are kept.
			self.links.extend(value for name,value in attrs if name == 'href')

	def handle_endtag(self, tag):
		"""Record that a tracked tag was closed; </table> ends the current section."""
		if tag in self._TRACKED_TAGS:
			setattr(self,tag,False)
		elif tag == 'table':
			for flag in self._TABLE_SCOPED_FLAGS:
				setattr(self,flag,False)

	def handle_data(self, data):
		"""Consume one piece of text.

		The first half stores the text into whichever field was armed by an
		earlier call; the second half arms flags for the text that follows.
		Each flag is cleared as soon as its value has been stored.
		"""
		data=data.strip()

		# Whitespace-only text carries no information.
		if not data:
			return

		# ---- Extraction --------------------------------------------------
		if self.h6:
			# Remembered so it can be prefixed to the <h7> entries that follow.
			self.last_h6=data+' '
		if self.p:
			if self.desc:
				self.desc=False
				self.description=data
			elif self.sol:
				self.sol=False
				self.solution=data
			elif self.cvss:
				self.cvss=False
				# The score is read from the last five characters of the text,
				# with any '=' stripped from the ends (e.g. a tail like '= 7.5').
				data=data[-5:].strip('=')
				try:
					self.cvss_score=float(data)
				except ValueError:
					self.cvss_score=0.0

		if self.prod and self.h7:
			self.products.append(self.last_h6+data.strip())
		if self.attk:
			data=data.strip(':').strip()
			# 'Remote' and 'Network' are both reported as 'Network'.
			if 'Remote' in data or 'Network' in data:
				self.attack_from+='Network '
			if 'Local' in data:
				self.attack_from+='Local'
			self.attk=False
		if self.impt:
			self.impact=data.strip(':').strip()
			self.impt=False

		# ---- Arming flags for the next piece of text ------------------------
		if self.clas and data == 'Location':
			self.attk=True
		if self.clas and data == 'Impact':
			self.impt=True
			self.clas=False
		if self.li:
			if self.cve:
				self.cve=False
				self.cve_id=data
			elif 'CVE ID:' in data:
				self.cve=True
		if self.h1:
			flag=self._SECTION_FLAGS.get(data)
			if flag is not None:
				setattr(self,flag,True)

	def get_description(self):
		"""Return the description text (' ' if none was found)."""
		return self.description

	def get_solutions(self):
		"""Return the solution text (' ' if none was found)."""
		return self.solution

	def get_cvss_score(self):
		"""Return the CVSS score as a float (0.0 if missing or unparsable)."""
		return self.cvss_score

	def get_cve_id(self):
		"""Return the text that followed the 'CVE ID:' label (' ' if none was found)."""
		return self.cve_id

	def get_references(self):
		"""Return the collected reference links as one comma-separated string."""
		return ','.join(self.links)

	def get_products(self):
		"""Return the affected products as one comma-separated string."""
		return ','.join(self.products)

	def get_attack_from(self):
		"""Return the attack location built from 'Network ' and/or 'Local'.

		The value starts from the ' ' default, so it always has a leading space.
		"""
		return self.attack_from

	def get_impact(self):
		"""Return the consequences of the vulnerability (' ' if none was found)."""
		return self.impact

# class : MyThread
# inherits : threading.Thread
# Functions: __init__(self,bitmap,strt,END)
#			run(self)
#			common_function(self,conn,bitmap,strt,END)
# Description: This class enables the usage of threads for faster execution.
#			Each object generates its own working set and executes.
class MyThread (threading.Thread):
	"""Worker thread that crawls one contiguous range of OSVDB page ids.

	Each thread opens its own database connection, downloads every page of
	its range that is not yet marked in the bitmap, parses it with
	MyHTMLParser and inserts or updates the matching row of the osvdb table.
	A page that cannot be downloaded, parsed or stored is reported and
	skipped; the crawl then continues with the next id.
	"""

	# Common prefix of every entry page; the numeric page id is appended to it.
	URL_HEAD='http://www.osvdb.org/show/osvdb/'

	def __init__(self,bitmap,strt,end):
		"""Remember the work range.

		bitmap -- sequence indexed by page id; a true entry means "skip this page".
		strt   -- first page id to crawl.
		end    -- page id at which to stop (not crawled itself).
		"""
		threading.Thread.__init__(self)
		self.bitmap=bitmap
		self.strt=strt
		self.end=end

	def run(self):
		"""Thread entry point: open a private connection and crawl the range."""
		try:
			conn=psycopg2.connect("dbname='postgres' user='postgres' host='localhost' port='5432' password='postgres'")
		except psycopg2.Error:
			# Without a connection there is nothing to close and nothing to do.
			print('Database connection error.')
			return
		try:
			self.common_function(conn,self.bitmap,self.strt,self.end)
		finally:
			conn.close()

	def common_function(self,conn,bitmap,strt=1,end=100000):
		"""Crawl the page ids strt .. end-1 and store the results through conn.

		Ids whose bitmap entry is true are skipped. The cursor created here is
		closed again even if an unexpected error interrupts the loop.
		"""
		cur=conn.cursor()
		try:
			for i in range(strt,end):
				# Data from this page is already in the database.
				if bitmap[i]:
					continue

				url=self.URL_HEAD+str(i)
				print('Fetching the url: ',url)

				html_page=self._fetch_page(url)
				if html_page is None:
					continue

				parse=self._parse_page(html_page)
				if parse is None:
					continue

				# A successfully parsed entry carries a CVE ID; without one
				# there is no key to store the record under.
				if parse.get_cve_id() == ' ':
					print('ERROR:\tNo CVE-ID found in the Page')
					continue

				self._store(conn,cur,i,parse)
		finally:
			cur.close()

	def _fetch_page(self,url):
		"""Download url and return its text, or None if the download failed."""
		try:
			request=urllib.request.Request(url)
			request.add_header("User-Agent", "My Crawler")
			opener=urllib.request.build_opener()
			with opener.open(request) as response:
				# Undecodable bytes are replaced instead of aborting the thread.
				return response.read().decode('utf-8',errors='replace')
		# HTTPError is a subclass of URLError, so it has to be tested first.
		except urllib.error.HTTPError:
			print('HTTP Error for: ',url)
		except urllib.error.URLError:
			print('URL Error for: ',url)
		except Exception:
			print('unknown error to download web page.')
		return None

	def _parse_page(self,html_page):
		"""Parse html_page and return the filled parser, or None on failure."""
		try:
			parse=MyHTMLParser()
			parse.feed(html_page)
			# Flush any text the parser is still buffering.
			parse.close()
		except Exception:
			print('unknown error in parsing file.')
			return None
		print('Parsing Complete.')
		return parse

	def _store(self,conn,cur,osvdb_id,parse):
		"""Insert the parsed record, or update it if its CVE ID already exists."""
		cve_id=parse.get_cve_id()
		try:
			cur.execute("SELECT * FROM osvdb WHERE cve_id=%s",(cve_id,))
			if cur.fetchone() is None:
				cur.execute("INSERT INTO osvdb (cve_id,osvdb_id,cvss_score,description,attack_from,impact,products,solutions,links) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)",
				(cve_id,osvdb_id,parse.get_cvss_score(),parse.get_description(),parse.get_attack_from(),parse.get_impact(),parse.get_products(),parse.get_solutions(),parse.get_references()))
				print('Inserted in Database: ',cve_id)
			else:
				print('ERROR: ',cve_id,' already present in the Database.\n Updating...')
				cur.execute("UPDATE osvdb SET cvss_score=%s,description=%s,attack_from=%s,impact=%s,products=%s,solutions=%s,links=%s WHERE cve_id=%s",(parse.get_cvss_score(),parse.get_description(),parse.get_attack_from(),parse.get_impact(),parse.get_products(),parse.get_solutions(),parse.get_references(),cve_id))
			conn.commit()
		except Exception as e:
			print('ERROR: Error in Inserting data into Database. \n Error Details: ',end='\t')
			print(e)
			# A failed statement leaves the transaction aborted; roll it back
			# so that the following pages can still be stored.
			conn.rollback()

# Global function to set parameters for full crawling of websites.
def full_crawl():
	"""Return the (bitmap, start, end) parameters for crawling every page.

	The bitmap has one False entry per id below the module-level end, i.e.
	no page is marked as already crawled, and the range starts at id 1.
	"""
	# The module-level end is only read here, never rebound.
	upper=end
	nothing_crawled=[False for _ in range(upper)]
	return (nothing_crawled,1,upper)

# Global function to set parameters for crawling web pages in a user given range.
def range_crawl():
	"""Ask the user for a range and return its (bitmap, start, end) parameters.

	Both positions are read from standard input and converted with int().
	The bitmap has one False entry per id below the second value, so nothing
	is marked as already crawled. Note that the crawl loop in
	MyThread.common_function stops before reaching the second value.
	"""
	first=int(input('Enter the Start Position: '))
	last=int(input('Enter the Last Position: '))
	nothing_crawled=[False for _ in range(last)]
	return (nothing_crawled,first,last)

# Global function to set parameters for crawling web pages which are recently added, i.e, pages added to the website after the last full/sync crawl.
def sync_crawl(conn):
	"""Return (bitmap, start, end) parameters that skip pages already stored.

	For every id from 1 up to (but excluding) the module-level end, the osvdb
	table is queried through conn. The bitmap entry is True when a row with
	that osvdb_id exists and False when the page still has to be crawled.
	"""
	global end

	# Start from "nothing stored yet"; index 0 is never queried and stays False.
	bitmap=[False for _ in range(end)]

	# The cursor is used only to look the ids up.
	cur=conn.cursor()
	for page_id in range(1,end):
		cur.execute("SELECT * FROM osvdb WHERE osvdb_id=%s",(page_id,))
		try:
			# Subscripting fails when fetchone() returned no row.
			cur.fetchone()[0]
			already_stored=True
		except Exception:
			already_stored=False
		bitmap[page_id]=already_stored

	cur.close()
	return (bitmap,1,end)
	
# Helper for main(): divides the page ids between the worker threads.
def _split_range(strt,end,parts):
	"""Split strt .. end-1 into at most parts contiguous, non-empty ranges.

	Returns a list of (start, stop) pairs, stop being exclusive. Together the
	pairs cover every id exactly once; when the ids do not divide evenly, the
	first ranges are one id longer. An empty range yields an empty list.
	"""
	total=end-strt
	if total <= 0 or parts <= 0:
		return []

	# Never create more ranges than there are ids to hand out.
	parts=min(parts,total)
	base,extra=divmod(total,parts)
	ranges=[]
	lo=strt
	for k in range(parts):
		hi=lo+base+(1 if k < extra else 0)
		ranges.append((lo,hi))
		lo=hi
	return ranges

# Global function to start the execution.
def main():
	"""Ask which crawl to run, then run it on one thread per CPU.

	Prints an error and returns if the database cannot be reached. Exits with
	status 1 when the menu choice is not 1, 2 or 3 (or is not a number).
	Returns 0 once every worker thread has finished.
	"""
	try:
		conn=psycopg2.connect("dbname='postgres' user='postgres' host='localhost' port='5432' password='postgres'")
	except psycopg2.Error:
		print('Database connection error.')
		return 0

	# This connection only prepares the crawl; each worker opens its own.
	# The finally clause also closes it on the sys.exit() path.
	try:
		try:
			inp=int(input('1. Full Crawl\n2. Range Crawl\n3. Sync Crawl\nEnter your choice(1/2/3):'))
		except ValueError:
			# Non-numeric input is handled like any other invalid choice.
			inp=None

		if inp == 1:
			bitmap,strt,end=full_crawl()
		elif inp == 2:
			bitmap,strt,end=range_crawl()
		elif inp == 3:
			bitmap,strt,end=sync_crawl(conn)
		else:
			print('Wrong Input\nExiting...')
			sys.exit(1)

		# Commit any changes to the database.
		conn.commit()
	finally:
		conn.close()

	# One worker per CPU. Splitting with divmod instead of a truncated step
	# keeps the trailing ids (and short ranges) from being silently dropped.
	no_of_threads=multiprocessing.cpu_count()
	workers=[MyThread(bitmap,lo,hi) for lo,hi in _split_range(strt,end,no_of_threads)]

	# start() invokes MyThread.run() in the new thread.
	for worker in workers:
		worker.start()

	# Wait for every worker so that main() returns only when the crawl is done.
	for worker in workers:
		worker.join()
	return 0

# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
	main()
